<!DOCTYPE HTML>
<html lang="en" class="light" dir="ltr">
    <head>
        <!-- Book generated using mdBook -->
        <meta charset="UTF-8">
        <title>Compressing - PISA Guide</title>


        <!-- Custom HTML head -->
        
        <meta name="description" content="">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="theme-color" content="#ffffff">

        <link rel="icon" href="../favicon.svg">
        <link rel="shortcut icon" href="../favicon.png">
        <link rel="stylesheet" href="../css/variables.css">
        <link rel="stylesheet" href="../css/general.css">
        <link rel="stylesheet" href="../css/chrome.css">
        <link rel="stylesheet" href="../css/print.css" media="print">

        <!-- Fonts -->
        <link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
        <link rel="stylesheet" href="../fonts/fonts.css">

        <!-- Highlight.js Stylesheets -->
        <link rel="stylesheet" href="../highlight.css">
        <link rel="stylesheet" href="../tomorrow-night.css">
        <link rel="stylesheet" href="../ayu-highlight.css">

        <!-- Custom theme stylesheets -->

    </head>
    <body class="sidebar-visible no-js">
    <div id="body-container">
        <!-- Provide site root to javascript -->
        <script>
            var path_to_root = "../";
            var default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? "navy" : "light";
        </script>

        <!-- Work around some values being stored in localStorage wrapped in quotes -->
        <script>
            try {
                var theme = localStorage.getItem('mdbook-theme');
                var sidebar = localStorage.getItem('mdbook-sidebar');

                if (theme.startsWith('"') && theme.endsWith('"')) {
                    localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
                }

                if (sidebar.startsWith('"') && sidebar.endsWith('"')) {
                    localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
                }
            } catch (e) { }
        </script>

        <!-- Set the theme before any content is loaded, prevents flash -->
        <script>
            var theme;
            try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
            if (theme === null || theme === undefined) { theme = default_theme; }
            var html = document.querySelector('html');
            html.classList.remove('light')
            html.classList.add(theme);
            var body = document.querySelector('body');
            body.classList.remove('no-js')
            body.classList.add('js');
        </script>

        <input type="checkbox" id="sidebar-toggle-anchor" class="hidden">

        <!-- Hide / unhide sidebar before it is displayed -->
        <script>
            var body = document.querySelector('body');
            var sidebar = null;
            var sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
            if (document.body.clientWidth >= 1080) {
                try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
                sidebar = sidebar || 'visible';
            } else {
                sidebar = 'hidden';
            }
            sidebar_toggle.checked = sidebar === 'visible';
            body.classList.remove('sidebar-visible');
            body.classList.add("sidebar-" + sidebar);
        </script>

        <nav id="sidebar" class="sidebar" aria-label="Table of contents">
            <div class="sidebar-scrollbox">
                <ol class="chapter"><li class="chapter-item expanded affix "><a href="../introduction.html">Introduction</a></li><li class="chapter-item expanded affix "><li class="part-title">User Guide</li><li class="chapter-item expanded "><a href="../guide/requirements.html"><strong aria-hidden="true">1.</strong> Requirements</a></li><li class="chapter-item expanded "><a href="../guide/installation.html"><strong aria-hidden="true">2.</strong> Installation</a></li><li class="chapter-item expanded "><a href="../guide/indexing-pipeline.html"><strong aria-hidden="true">3.</strong> Indexing Pipeline</a></li><li class="chapter-item expanded "><a href="../guide/parsing.html"><strong aria-hidden="true">4.</strong> Parsing</a></li><li class="chapter-item expanded "><a href="../guide/inverting.html"><strong aria-hidden="true">5.</strong> Inverting</a></li><li class="chapter-item expanded "><a href="../guide/compressing.html" class="active"><strong aria-hidden="true">6.</strong> Compressing</a></li><li class="chapter-item expanded "><a href="../guide/wand_data.html"><strong aria-hidden="true">7.</strong> "WAND" Data</a></li><li class="chapter-item expanded "><a href="../guide/querying.html"><strong aria-hidden="true">8.</strong> Querying</a></li><li class="chapter-item expanded "><a href="../guide/algorithms.html"><strong aria-hidden="true">9.</strong> Retrieval Algorithms</a></li><li class="chapter-item expanded "><a href="../guide/reordering.html"><strong aria-hidden="true">10.</strong> Document Reordering</a></li><li class="chapter-item expanded "><a href="../guide/sharding.html"><strong aria-hidden="true">11.</strong> Sharding</a></li><li class="chapter-item expanded "><a href="../guide/threshold-estimation.html"><strong aria-hidden="true">12.</strong> Threshold Estimation</a></li><li class="chapter-item expanded affix "><li class="part-title">Tutorials</li><li class="chapter-item expanded "><a href="../tutorial/robust04.html"><strong aria-hidden="true">13.</strong> Regression test for Robust04</a></li><li class="chapter-item expanded affix "><li class="part-title">CLI Reference</li><li class="chapter-item expanded "><a href="../cli/compress_inverted_index.html"><strong aria-hidden="true">14.</strong> compress_inverted_index</a></li><li class="chapter-item expanded "><a href="../cli/compute_intersection.html"><strong aria-hidden="true">15.</strong> compute_intersection</a></li><li class="chapter-item expanded "><a href="../cli/count-postings.html"><strong aria-hidden="true">16.</strong> count-postings</a></li><li class="chapter-item expanded "><a href="../cli/create_wand_data.html"><strong aria-hidden="true">17.</strong> create_wand_data</a></li><li class="chapter-item expanded "><a href="../cli/evaluate_queries.html"><strong aria-hidden="true">18.</strong> evaluate_queries</a></li><li class="chapter-item expanded "><a href="../cli/extract-maxscores.html"><strong aria-hidden="true">19.</strong> extract-maxscores</a></li><li class="chapter-item expanded "><a href="../cli/extract_topics.html"><strong aria-hidden="true">20.</strong> extract_topics</a></li><li class="chapter-item expanded "><a href="../cli/invert.html"><strong aria-hidden="true">21.</strong> invert</a></li><li class="chapter-item expanded "><a href="../cli/kth_threshold.html"><strong aria-hidden="true">22.</strong> kth_threshold</a></li><li class="chapter-item expanded "><a href="../cli/lexicon.html"><strong aria-hidden="true">23.</strong> lexicon</a></li><li class="chapter-item expanded "><a href="../cli/map_queries.html"><strong aria-hidden="true">24.</strong> map_queries</a></li><li class="chapter-item expanded "><a href="../cli/parse_collection.html"><strong aria-hidden="true">25.</strong> parse_collection</a></li><li class="chapter-item expanded "><a href="../cli/partition_fwd_index.html"><strong aria-hidden="true">26.</strong> partition_fwd_index</a></li><li class="chapter-item expanded "><a href="../cli/queries.html"><strong aria-hidden="true">27.</strong> queries</a></li><li class="chapter-item expanded "><a href="../cli/read_collection.html"><strong aria-hidden="true">28.</strong> read_collection</a></li><li class="chapter-item expanded "><a href="../cli/reorder-docids.html"><strong aria-hidden="true">29.</strong> reorder-docids</a></li><li class="chapter-item expanded "><a href="../cli/sample_inverted_index.html"><strong aria-hidden="true">30.</strong> sample_inverted_index</a></li><li class="chapter-item expanded "><a href="../cli/selective_queries.html"><strong aria-hidden="true">31.</strong> selective_queries</a></li><li class="chapter-item expanded "><a href="../cli/shards.html"><strong aria-hidden="true">32.</strong> shards</a></li><li class="chapter-item expanded "><a href="../cli/stem_queries.html"><strong aria-hidden="true">33.</strong> stem_queries</a></li><li class="chapter-item expanded "><a href="../cli/taily-stats.html"><strong aria-hidden="true">34.</strong> taily-stats</a></li><li class="chapter-item expanded "><a href="../cli/taily-thresholds.html"><strong aria-hidden="true">35.</strong> taily-thresholds</a></li><li class="chapter-item expanded "><a href="../cli/thresholds.html"><strong aria-hidden="true">36.</strong> thresholds</a></li><li class="chapter-item expanded affix "><li class="part-title">Specifications</li><li class="chapter-item expanded "><a href="../specs/lookup-table.html"><strong aria-hidden="true">37.</strong> Lookup Table</a></li></ol>
            </div>
            <div id="sidebar-resize-handle" class="sidebar-resize-handle"></div>
        </nav>

        <!-- Track and set sidebar scroll position -->
        <script>
            var sidebarScrollbox = document.querySelector('#sidebar .sidebar-scrollbox');
            sidebarScrollbox.addEventListener('click', function(e) {
                if (e.target.tagName === 'A') {
                    sessionStorage.setItem('sidebar-scroll', sidebarScrollbox.scrollTop);
                }
            }, { passive: true });
            var sidebarScrollTop = sessionStorage.getItem('sidebar-scroll');
            sessionStorage.removeItem('sidebar-scroll');
            if (sidebarScrollTop) {
                // preserve sidebar scroll position when navigating via links within sidebar
                sidebarScrollbox.scrollTop = sidebarScrollTop;
            } else {
                // scroll sidebar to current active section when navigating via "next/previous chapter" buttons
                var activeSection = document.querySelector('#sidebar .active');
                if (activeSection) {
                    activeSection.scrollIntoView({ block: 'center' });
                }
            }
        </script>

        <div id="page-wrapper" class="page-wrapper">

            <div class="page">
                                <div id="menu-bar-hover-placeholder"></div>
                <div id="menu-bar" class="menu-bar sticky">
                    <div class="left-buttons">
                        <label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
                            <i class="fa fa-bars"></i>
                        </label>
                        <button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
                            <i class="fa fa-paint-brush"></i>
                        </button>
                        <ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
                            <li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
                        </ul>
                        <button id="search-toggle" class="icon-button" type="button" title="Search. (Shortkey: s)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="S" aria-controls="searchbar">
                            <i class="fa fa-search"></i>
                        </button>
                    </div>

                    <h1 class="menu-title">PISA Guide</h1>

                    <div class="right-buttons">
                        <a href="../print.html" title="Print this book" aria-label="Print this book">
                            <i id="print-button" class="fa fa-print"></i>
                        </a>

                    </div>
                </div>

                <div id="search-wrapper" class="hidden">
                    <form id="searchbar-outer" class="searchbar-outer">
                        <input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
                    </form>
                    <div id="searchresults-outer" class="searchresults-outer hidden">
                        <div id="searchresults-header" class="searchresults-header"></div>
                        <ul id="searchresults">
                        </ul>
                    </div>
                </div>

                <!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
                <script>
                    document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
                    document.getElementById('sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
                    Array.from(document.querySelectorAll('#sidebar a')).forEach(function(link) {
                        link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
                    });
                </script>

                <div id="content" class="content">
                    <main>
                        <h1 id="compressing"><a class="header" href="#compressing">Compressing</a></h1>
<p>To create an index use the command <code>compress_inverted_index</code>. The
available index types are listed in <code>index_types.hpp</code>.</p>
<p>For example, to create an index using the optimal partitioning
algorithm, using the test collection, execute the command:</p>
<pre><code>$ ./bin/compress_inverted_index -t opt \
    -c ../test/test_data/test_collection \
    -o test_collection.index.opt \
    --check
</code></pre>
<p>where <code>test/test_data/test_collection</code> is the <em>basename</em> of the
collection, that is the name without the <code>.{docs,freqs,sizes}</code>
extensions, and <code>test_collection.index.opt</code> is the filename of the
output index. <code>--check</code> will trigger a verification step to check the
correctness of the index.</p>
<h2 id="compression-algorithms"><a class="header" href="#compression-algorithms">Compression Algorithms</a></h2>
<h3 id="binary-interpolative-coding"><a class="header" href="#binary-interpolative-coding">Binary Interpolative Coding</a></h3>
<p>Binary Interpolative Coding (BIC) directly encodes a monotonically
increasing sequence. At each step of this recursive algorithm, the
middle element <em>m</em> is encoded by a number <em>m − l − p</em>, where <em>l</em> is the
lowest value and <em>p</em> is the position of <em>m</em> in the currently encoded
sequence. Then we recursively encode the values to the left and right of
<em>m</em>. BIC encodings are very space-efficient, particularly on clustered
data; however, decoding is relatively slow.</p>
<p>To compress an index using BIC use the index type <code>block_interpolative</code>.</p>
<blockquote>
<p>Alistair Moffat, Lang Stuiver: Binary Interpolative Coding for Effective Index Compression. Inf. Retr. 3(1): 25-47 (2000)</p>
</blockquote>
<h3 id="elias-fano"><a class="header" href="#elias-fano">Elias-Fano</a></h3>
<p>Given a monotonically increasing integer sequence <em>S</em> of size <em>n</em>, such that \(S_{n-1} &lt; u\), we can encode it in binary using \(\lceil\log u\rceil\) bits.
Elias-Fano coding splits each number into two parts, a low part consisting of \(l = \lceil\log \frac{u}{n}\rceil\) right-most bits, and a high part consisting of the remaining \(\lceil\log u\rceil - l\) left-most bits. The low parts are explicitly written in binary for all numbers, in a single stream of bits. The high parts are compressed by writing, in negative-unary form, the gaps between the high parts of consecutive numbers.</p>
<p>To compress an index using Elias-Fano use the index type <code>ef</code>.</p>
<blockquote>
<p>Sebastiano Vigna. 2013. Quasi-succinct indices. In Proceedings of the sixth ACM international conference on Web search and data mining (WSDM ‘13). ACM, New York, NY, USA, 83-92.</p>
</blockquote>
<h3 id="maskedvbyte"><a class="header" href="#maskedvbyte">MaskedVByte</a></h3>
<blockquote>
<p>Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015.</p>
</blockquote>
<h3 id="optpfd"><a class="header" href="#optpfd">OptPFD</a></h3>
<blockquote>
<p>Hao Yan, Shuai Ding, and Torsten Suel. 2009. Inverted index compression and query processing with optimized document ordering. In Proceedings of the 18th international conference on World wide web (WWW '09). ACM, New York, NY, USA, 401-410. DOI: https://doi.org/10.1145/1526709.1526764</p>
</blockquote>
<h3 id="partitioned-elias-fano"><a class="header" href="#partitioned-elias-fano">Partitioned Elias Fano</a></h3>
<blockquote>
<p>Giuseppe Ottaviano and Rossano Venturini. 2014. Partitioned Elias-Fano indexes. In Proceedings of the 37th international ACM SIGIR conference on Research &amp; development in information retrieval (SIGIR '14). ACM, New York, NY, USA, 273-282. DOI: https://doi.org/10.1145/2600428.2609615</p>
</blockquote>
<h3 id="qmx"><a class="header" href="#qmx">QMX</a></h3>
<p>Quantities, Multipliers, and eXtractor (QMX) packs as many integers as possible into 128-bit words (Quantities) and stores the selectors (eXtractors) separately in a different stream. The selectors are compressed (Multipliers) with
RLE (Run-Length Encoding).</p>
<p>To compress an index using QMX use the index type <code>block_qmx</code>.</p>
<blockquote>
<p>Andrew Trotman. 2014. Compression, SIMD, and Postings Lists. In Proceedings of the 2014 Australasian Document Computing Symposium (ADCS '14), J. Shane Culpepper, Laurence Park, and Guido Zuccon (Eds.). ACM, New York, NY, USA, Pages 50, 8 pages. DOI: https://doi.org/10.1145/2682862.2682870</p>
</blockquote>
<h3 id="simd-bp128"><a class="header" href="#simd-bp128">SIMD-BP128</a></h3>
<blockquote>
<p>Daniel Lemire, Leonid Boytsov: Decoding billions of integers per second through vectorization. Softw., Pract. Exper. 45(1): 1-29 (2015)</p>
</blockquote>
<h3 id="simple8b"><a class="header" href="#simple8b">Simple8b</a></h3>
<blockquote>
<p>Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. Softw., Pract. Exper. 40(2): 131-147 (2010)</p>
</blockquote>
<h3 id="simple16"><a class="header" href="#simple16">Simple16</a></h3>
<blockquote>
<p>Jiangong Zhang, Xiaohui Long, and Torsten Suel. 2008. Performance of compressed inverted list caching in search engines. In Proceedings of the 17th international conference on World Wide Web (WWW '08). ACM, New York, NY, USA, 387-396. DOI: https://doi.org/10.1145/1367497.1367550</p>
</blockquote>
<h3 id="streamvbyte"><a class="header" href="#streamvbyte">StreamVByte</a></h3>
<blockquote>
<p>Daniel Lemire, Nathan Kurz, Christoph Rupp: Stream VByte: Faster byte-oriented integer compression. Inf. Process. Lett. 130: 1-6 (2018). DOI: https://doi.org/10.1016/j.ipl.2017.09.011</p>
</blockquote>
<h3 id="varint-g8iu"><a class="header" href="#varint-g8iu">Varint-G8IU</a></h3>
<blockquote>
<p>Alexander A. Stepanov, Anil R. Gangolli, Daniel E. Rose, Ryan J. Ernst, and Paramjit S. Oberoi. 2011. SIMD-based decoding of posting lists. In Proceedings of the 20th ACM international conference on Information and knowledge management (CIKM '11), Bettina Berendt, Arjen de Vries, Wenfei Fan, Craig Macdonald, Iadh Ounis, and Ian Ruthven (Eds.). ACM, New York, NY, USA, 317-326. DOI: https://doi.org/10.1145/2063576.2063627</p>
</blockquote>
<h3 id="varintgb"><a class="header" href="#varintgb">VarintGB</a></h3>
<blockquote>
<p>Jeffrey Dean. 2009. Challenges in building large-scale information retrieval systems: invited talk. In Proceedings of the Second ACM International Conference on Web Search and Data Mining (WSDM '09), Ricardo Baeza-Yates, Paolo Boldi, Berthier Ribeiro-Neto, and B. Barla Cambazoglu (Eds.). ACM, New York, NY, USA, 1-1. DOI: http://dx.doi.org/10.1145/1498759.1498761</p>
</blockquote>

                    </main>

                    <nav class="nav-wrapper" aria-label="Page navigation">
                        <!-- Mobile navigation buttons -->
                            <a rel="prev" href="../guide/inverting.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                                <i class="fa fa-angle-left"></i>
                            </a>

                            <a rel="next prefetch" href="../guide/wand_data.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
                                <i class="fa fa-angle-right"></i>
                            </a>

                        <div style="clear: both"></div>
                    </nav>
                </div>
            </div>

            <nav class="nav-wide-wrapper" aria-label="Page navigation">
                    <a rel="prev" href="../guide/inverting.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                        <i class="fa fa-angle-left"></i>
                    </a>

                    <a rel="next prefetch" href="../guide/wand_data.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
                        <i class="fa fa-angle-right"></i>
                    </a>
            </nav>

        </div>




        <script>
            window.playground_copyable = true;
        </script>


        <script src="../elasticlunr.min.js"></script>
        <script src="../mark.min.js"></script>
        <script src="../searcher.js"></script>

        <script src="../clipboard.min.js"></script>
        <script src="../highlight.js"></script>
        <script src="../book.js"></script>

        <!-- Custom JS scripts -->


    </div>
    </body>
</html>
